#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/9/8 22:23
# @Author  : LuChao
# @Site    : 
# @File    : GetPythonDjango.py
# @Software: PyCharm
import os
import pdfkit
import requests
from bs4 import BeautifulSoup


# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/9/8 22:23
# @Author  : LuChao
# @Site    :
# @Software: PyCharm


def get_urls():
    """Return the list of article URLs whose content will be rendered to PDF.

    The list is currently hard-coded to a single blog post; the original
    left-hand-menu scraping logic (liujiangblog.com) was disabled and has
    been removed as dead code.

    :return: list[str] of article URLs
    """
    urls = [
        'http://blog.mtianyan.cn/post/3c234171.html',
    ]
    return urls



def get_learn_django_content(urls):
    """Download each article, extract its body, and save it as a local HTML file.

    For every URL, the element with class ``post-body`` is pulled out of the
    page and wrapped in a minimal HTML template, then written to ``<k>.html``
    in the current working directory (k = index of the URL).

    :param urls: iterable of article URLs to fetch
    :return: list of the HTML file names that were written, in input order
    :raises requests.HTTPError: if a page responds with an error status
    """
    htmls = []

    # Minimal HTML shell the extracted article body is injected into.
    html_template = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <link rel="stylesheet" href="style.css" type="text/css" media="all">
    </head>
    <body>
    {content}
    </body>
    </html>
    """

    for k, base_url in enumerate(urls):
        print(base_url)
        # timeout prevents an unresponsive server from hanging the script;
        # raise_for_status avoids silently rendering an HTTP error page.
        response = requests.get(base_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find(class_='post-body')
        html = html_template.format(content=content)
        html = html.encode("UTF-8")
        html_name = str(k) + ".html"
        with open(html_name, 'wb') as f:
            f.write(html)
        htmls.append(html_name)
    return htmls


def save_pdf(htmls, name):
    """Convert the given HTML files into a single PDF named *name*.

    :param htmls: list of HTML file paths to concatenate into one document
    :param name: output PDF file name
    """
    # wkhtmltopdf rendering options; tweak these to adjust page layout.
    render_options = {
        'page-size': 'Letter',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip'),
        ],
    }
    pdfkit.from_file(htmls, name, options=render_options)


# 删除.html文件
def del_files(path):
    """Recursively delete every ``.html`` file found under *path*.

    Each removed file name is printed for visibility.
    """
    for dirpath, _dirnames, filenames in os.walk(path):
        html_names = [fn for fn in filenames if fn.endswith(".html")]
        for fn in html_names:
            print(fn)
            os.remove(os.path.join(dirpath, fn))


# 获取.html文件
def get_htmls(path):
    """Collect the paths of all ``.html`` files under *path* (recursively).

    Bug fix: the original built paths as ``os.getcwd() + '\\' + name``, which
    ignored the directory the file was actually found in (wrong for any
    subdirectory or any *path* other than the CWD) and was Windows-only.
    Paths are now joined from the walk's own ``root``, portably.

    :param path: directory to search
    :return: list of full paths to the .html files found
    """
    htmls = []
    for root, dirs, files in os.walk(path):
        for name in files:
            if name.endswith(".html"):
                htmls.append(os.path.join(root, name))
    return htmls


if __name__ == '__main__':
    # Clean out any stale intermediate .html files before starting.
    del_files(os.getcwd())
    # Fetch the article list once (the original called get_urls() twice
    # and discarded the first result).
    urls = get_urls()
    save_pdf(get_learn_django_content(urls), 'Scrapy分布式爬虫打造搜索引擎.pdf')
    # Remove the intermediate .html files produced during conversion.
    del_files(os.getcwd())
